

import numpy as np
import pandas as pd
import pandas_profiling
import scipy as sc
import sklearn
from sklearn import preprocessing
from sklearn.metrics import accuracy_score,mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import urllib.request
from pprint import pprint
import pandas_profiling
import shapash
from html_table_parser import HTMLTableParser
import warnings
warnings.filterwarnings("ignore")
The source of the data is Wikipedia.
class web_scrap:
    """Download a web page and return one of the HTML tables found in it.

    Parameters
    ----------
    url : str
        Address of the page to download.
    number : int
        Index of the wanted table in the list of tables parsed from the page.
    """

    def __init__(self,url,number):
        self.url = url
        self.number = number

    def Table_scrap(self):
        """Fetch ``self.url`` and return table ``self.number`` from HTMLTableParser.

        Raises whatever ``urllib`` raises on network errors and ``IndexError``
        when the page holds fewer tables than ``number`` requires.
        """
        request = urllib.request.Request(self.url)
        # The context manager closes the HTTP response even if reading or
        # decoding fails; the original left the connection open.
        with urllib.request.urlopen(request) as open_link:
            xhtml = open_link.read().decode("utf-8")
        table_content = HTMLTableParser()
        table_content.feed(xhtml)
        return table_content.tables[self.number]
# Scrape the table at index 3 of the West Bengal article (district-wise population)
Table_of_each_population = web_scrap(
    "https://en.wikipedia.org/wiki/West_Bengal", 3
).Table_scrap()
# Row 0 carries the column names; every following row is one district
Bengal_population_Dataframe = pd.DataFrame(
    Table_of_each_population[1:],
    columns=Table_of_each_population[0],
)
Bengal_population_Dataframe
| District | Population | Growth rate | Sex ratio | Literacy | Density per square Kilometer | |
|---|---|---|---|---|---|---|
| 0 | North 24 Parganas | 10,009,781 | 12.04 | 955 | 84.06 | 2445 |
| 1 | South 24 Parganas | 8,161,961 | 18.17 | 956 | 77.51 | 819 |
| 2 | Purba Bardhaman | 4,835,432 | – | 945 | 74.73 | 890 |
| 3 | Paschim Bardhaman | 2,882,031 | – | 922 | 78.75 | 1800 |
| 4 | Murshidabad | 7,103,807 | 21.09 | 958 | 66.59 | 1334 |
| 5 | West Midnapore | 5,913,457 | 13.86 | 966 | 78.00 | 631 |
| 6 | Hooghly | 5,519,145 | 9.46 | 961 | 81.80 | 1753 |
| 7 | Nadia | 5,167,600 | 12.22 | 947 | 74.97 | 1316 |
| 8 | East Midnapore | 5,095,875 | 15.36 | 938 | 87.02 | 1081 |
| 9 | Howrah | 4,850,029 | 13.50 | 939 | 83.31 | 3306 |
| 10 | Kolkata | 4,496,694 | −1.67 | 908 | 86.31 | 24306 |
| 11 | Maldah | 3,988,845 | 21.22 | 944 | 61.73 | 1069 |
| 12 | Jalpaiguri | 3,872,846 | 13.87 | 953 | 73.25 | 622 |
| 13 | Alipurduar [a] | 1,700,000 | – | – | – | 400 |
| 14 | Bankura | 3,596,292 | 12.64 | 954 | 70.95 | 523 |
| 15 | Birbhum | 3,502,404 | 16.15 | 956 | 70.68 | 771 |
| 16 | North Dinajpur | 3,007,134 | 23.15 | 939 | 59.07 | 958 |
| 17 | Purulia | 2,930,115 | 15.52 | 957 | 64.48 | 468 |
| 18 | Cooch Behar | 2,819,086 | 13.71 | 942 | 74.78 | 832 |
| 19 | Darjeeling | 1,846,823 | 14.77 | 970 | 79.56 | 586 |
| 20 | Dakshin Dinajpur | 1,676,276 | 11.52 | 956 | 72.82 | 755 |
| 21 | Kalimpong [a] | 202,239 | – | – | – | 270 |
| 22 | Jhargram [a] | 1,136,548 | – | – | – | 374 |
## Converting all columns into a suitable data type
def variable_covertor(dataframe):
    """Turn strings with thousands separators (e.g. ``"1,234"``) into ints.

    ``dataframe`` is any iterable of strings; a list of ints is returned.
    """
    return [int(text.replace(",", "")) for text in dataframe]
def convertor(dataframe):
    """Convert an iterable of numeric strings into a list of floats.

    Every en dash ("–"), which the scraped tables use for a missing value, is
    replaced by "0.00", so a missing cell becomes 0.0. The Unicode minus sign
    (U+2212) that the tables use for negative numbers is mapped to the ASCII
    hyphen first, because ``float`` does not accept it.
    """
    return [
        float(cell.replace("\u2212", "-").replace("–", "0.00"))
        for cell in dataframe
    ]
# Literacy and sex ratio contain "–" placeholders, so they go through convertor
for float_column in ("Literacy", "Sex ratio"):
    Bengal_population_Dataframe[float_column] = convertor(
        Bengal_population_Dataframe[float_column]
    )
# Population is written with thousands separators
Bengal_population_Dataframe['Population'] = variable_covertor(
    Bengal_population_Dataframe['Population']
)
Bengal_population_Dataframe.profile_report()
class visualization:
    """Creating the different visual charts for understanding.

    Parameters
    ----------
    dataframe : the data to plot (a DataFrame, or a Series for ``boxplot_plot``)
    xvalue, yvalue : column names for the seaborn/pandas based charts; for
        ``bar_chart`` they are the sequences of bar labels and bar heights
    xlabel, ylabel, title : text written on the axes and above the chart
    """

    def __init__(self,dataframe,xvalue,yvalue,xlabel,ylabel,title):
        self.dataframe = dataframe
        self.xvalue = xvalue
        self.yvalue = yvalue
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.title = title

    def _decorate(self):
        """Apply the configured axis labels and title to the current axes."""
        plt.xlabel(self.xlabel)
        plt.ylabel(self.ylabel)
        plt.title(self.title)

    def boxplot_plot(self):
        """Box plot of ``dataframe`` (a Series gives one horizontal box)."""
        # Name the argument explicitly: the meaning of the first positional
        # argument of sns.boxplot differs between seaborn releases.
        if isinstance(self.dataframe, pd.DataFrame):
            sns.boxplot(data=self.dataframe)
        else:
            sns.boxplot(x=self.dataframe)
        self._decorate()
        plt.show()

    def countplot_without_percentage_representation(self):
        """Count plot of column ``xvalue``."""
        sns.set_style("darkgrid")
        sns.countplot(x=self.xvalue,data=self.dataframe)
        plt.xticks(rotation=60)
        self._decorate()
        plt.show()

    def countplot_with_percentage_representation(self):
        """Count plot of column ``xvalue`` with a percentage label on every bar.

        Percentages are relative to ``len(dataframe)``, i.e. to all rows.
        """
        sns.set_style("darkgrid")
        ax = sns.countplot(x=self.xvalue,data=self.dataframe)
        total = len(self.dataframe)
        for p in ax.patches:
            percentage = '{:.1f}%'.format(100 * p.get_height()/total)
            # Anchor the label at the top-right corner of the bar
            x = p.get_x() + p.get_width()
            y = p.get_height()
            ax.annotate(percentage, (x, y),ha='right')
        plt.xticks(rotation=60)
        self._decorate()
        plt.show()

    def scatterplot(self):
        """Scatter plot to see the multivariate outlier."""
        sns.scatterplot(x=self.xvalue,y=self.yvalue,data=self.dataframe)
        self._decorate()
        plt.show()

    def heatmap(self):
        """Annotated heat map of the correlations between the numeric columns."""
        # Restrict to numeric/bool columns so text columns cannot break corr()
        numeric_part = self.dataframe.select_dtypes(include=["number", "bool"])
        correlation_data = numeric_part.corr()
        sns.heatmap(correlation_data,annot=True)
        self._decorate()
        plt.show()

    def violinplot(self):
        """Violin plot of ``yvalue`` against ``xvalue``."""
        # NOTE(review): split=True and the fixed Yes/No palette look like they
        # expect a hue variable, which is not passed - confirm intended usage.
        sns.violinplot(data=self.dataframe,x=self.xvalue,y=self.yvalue,split=True,inner="quart",linewidth=1,palette={'Yes':"b","No":".85"})
        sns.despine(left=True)
        self._decorate()
        plt.show()

    def _pie_slices(self):
        """Return (labels, values, explode) with one entry per distinct label."""
        # Summing per label keeps the three lists the same length even when
        # a label or a value occurs more than once.
        totals = self.dataframe.groupby(self.xvalue, sort=False)[self.yvalue].sum()
        labels = totals.index.tolist()
        values = totals.tolist()
        grand_total = sum(values)
        explode = [value / grand_total if grand_total else 0 for value in values]
        return labels, values, explode

    def piechart(self):
        """Pie chart of column ``yvalue`` summed per distinct ``xvalue`` label.

        Each slice is pushed outwards in proportion to its share of the total.
        """
        fig, ax = plt.subplots()
        labels, values, explode = self._pie_slices()
        ax.pie(values, explode=explode, labels=labels, autopct='%1.0f%%', shadow=False, startangle=0,pctdistance=1.2,labeldistance=1.5)
        ax.axis('equal')
        ax.set_title(self.title)
        ax.legend(frameon=False, bbox_to_anchor=(1.8,1.0))
        plt.show()

    def bar_chart(self):
        """Bar chart; here ``xvalue`` holds the bar labels and ``yvalue`` the heights."""
        objects = tuple(self.xvalue)
        y_pos = np.arange(len(objects))
        performance = list(self.yvalue)
        plt.bar(y_pos, performance, align='center', alpha=0.5)
        plt.xticks(y_pos, objects,rotation=60)
        plt.ylabel(self.ylabel)
        plt.title(self.title)
        plt.show()
# bar_chart plots the values exactly as given, so scale the head counts to
# millions to match the "Population in Millions" axis label.
visualization(
    Bengal_population_Dataframe,
    Bengal_population_Dataframe.iloc[:11,0],
    Bengal_population_Dataframe.iloc[:11,1] / 1e6,
    "",
    "Population in Millions",
    "District wise population",
).bar_chart()
# Scrape the table at index 11 of the 2016 election article (seats per district)
Bengal_2016_result_analysis = web_scrap(
    "https://en.wikipedia.org/wiki/2016_West_Bengal_Legislative_Assembly_election", 11
).Table_scrap()
# Header row first, data rows after it
Bengal_2016_result_analysis_df = pd.DataFrame(
    Bengal_2016_result_analysis[1:],
    columns=Bengal_2016_result_analysis[0],
)
Bengal_2016_result_analysis_df
| District | Total | AITC | LF | UPA | NDA | Others | |
|---|---|---|---|---|---|---|---|
| 0 | Cooch Behar | 9 | 8 | 1 | 0 | 0 | 0 |
| 1 | Jalpaiguri | 7 | 6 | 0 | 1 | 0 | 0 |
| 2 | Alipurduar | 5 | 4 | 0 | 0 | 1 | 0 |
| 3 | Darjeeling | 6 | 0 | 1 | 2 | 3 | 0 |
| 4 | North Dinajpur | 9 | 4 | 2 | 3 | 0 | 0 |
| 5 | South Dinajpur | 6 | 2 | 3 | 1 | 0 | 0 |
| 6 | Malda | 12 | 1 | 1 | 8 | 1 | 1 |
| 7 | Murshidabad | 22 | 4 | 4 | 14 | 0 | 0 |
| 8 | Nadia | 17 | 13 | 1 | 3 | 0 | 0 |
| 9 | North 24 Parganas | 33 | 27 | 3 | 3 | 0 | 0 |
| 10 | South 24 Parganas | 31 | 29 | 2 | 0 | 0 | 0 |
| 11 | Kolkata | 11 | 11 | 0 | 0 | 0 | 0 |
| 12 | Howrah | 16 | 15 | 0 | 1 | 0 | 0 |
| 13 | Hooghly | 18 | 16 | 1 | 1 | 0 | 0 |
| 14 | East Midnapore | 16 | 13 | 3 | 0 | 0 | 0 |
| 15 | West Midnapore | 19 | 17 | 0 | 1 | 1 | 0 |
| 16 | Purulia | 9 | 7 | 0 | 2 | 0 | 0 |
| 17 | Bankura | 12 | 7 | 3 | 2 | 0 | 0 |
| 18 | Burdwan | 25 | 19 | 5 | 1 | 0 | 0 |
| 19 | Birbhum | 11 | 9 | 1 | 1 | 0 | 0 |
| 20 | Total | 294 | 211 | 32 | 44 | 6 | 1 |
# The scraped seat counts are strings; convert every numeric column to float
for seat_column in ("Total", "AITC", "LF", "UPA", "NDA", "Others"):
    Bengal_2016_result_analysis_df[seat_column] = convertor(
        Bengal_2016_result_analysis_df[seat_column]
    )
Bengal_2016_result_analysis_df
| District | Total | AITC | LF | UPA | NDA | Others | |
|---|---|---|---|---|---|---|---|
| 0 | Cooch Behar | 9.0 | 8.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1 | Jalpaiguri | 7.0 | 6.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | Alipurduar | 5.0 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | Darjeeling | 6.0 | 0.0 | 1.0 | 2.0 | 3.0 | 0.0 |
| 4 | North Dinajpur | 9.0 | 4.0 | 2.0 | 3.0 | 0.0 | 0.0 |
| 5 | South Dinajpur | 6.0 | 2.0 | 3.0 | 1.0 | 0.0 | 0.0 |
| 6 | Malda | 12.0 | 1.0 | 1.0 | 8.0 | 1.0 | 1.0 |
| 7 | Murshidabad | 22.0 | 4.0 | 4.0 | 14.0 | 0.0 | 0.0 |
| 8 | Nadia | 17.0 | 13.0 | 1.0 | 3.0 | 0.0 | 0.0 |
| 9 | North 24 Parganas | 33.0 | 27.0 | 3.0 | 3.0 | 0.0 | 0.0 |
| 10 | South 24 Parganas | 31.0 | 29.0 | 2.0 | 0.0 | 0.0 | 0.0 |
| 11 | Kolkata | 11.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 12 | Howrah | 16.0 | 15.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 13 | Hooghly | 18.0 | 16.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 14 | East Midnapore | 16.0 | 13.0 | 3.0 | 0.0 | 0.0 | 0.0 |
| 15 | West Midnapore | 19.0 | 17.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| 16 | Purulia | 9.0 | 7.0 | 0.0 | 2.0 | 0.0 | 0.0 |
| 17 | Bankura | 12.0 | 7.0 | 3.0 | 2.0 | 0.0 | 0.0 |
| 18 | Burdwan | 25.0 | 19.0 | 5.0 | 1.0 | 0.0 | 0.0 |
| 19 | Birbhum | 11.0 | 9.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 20 | Total | 294.0 | 211.0 | 32.0 | 44.0 | 6.0 | 1.0 |
# One bar chart per seat column; rows 0-19 are the districts, which leaves
# out the final "Total" row.
for column_position, y_axis_label, chart_title in (
    (1, "Number of seat be parties", "Total seat district wise"),
    (2, "Number of seat", "AITC performance in 2016"),
    (3, "Number of seat", "Left performance in 2016"),
    (4, "Number of seat", "UPA performance in 2016"),
    (5, "Number of seat", "NDA performance in 2016"),
):
    visualization(
        Bengal_2016_result_analysis_df,
        Bengal_2016_result_analysis_df.iloc[:20,0],
        Bengal_2016_result_analysis_df.iloc[:20,column_position],
        "",
        y_axis_label,
        chart_title,
    ).bar_chart()
# Candidate-level results; the column names sit on the third row of the sheet
Bengal_detail_result_analysis = pd.read_excel("Detailed Results.xlsx",header=2)
Bengal_detail_result_analysis
| Constituency No. | Constituency Name | Candidate Name | Candidate Sex | Candidate Age | Candidate Category | Party Name | VALID VOTES POLLED in General | VALID VOTES POLLED in Postal | Total Valid Votes | Total Electors | Total Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Mekliganj | ARGHYA ROY PRADHAN (BILU) | M | 44.0 | SC | AITC | 74608 | 215 | 74823 | 202689 | 180960 |
| 1 | 1 | Mekliganj | PARESH CHANDRA ADHIKARY | M | 63.0 | SC | AIFB | 67821 | 365 | 68186 | 202689 | 180960 |
| 2 | 1 | Mekliganj | DADHIRAM RAY | M | 32.0 | SC | BJP | 23313 | 42 | 23355 | 202689 | 180960 |
| 3 | 1 | Mekliganj | JYOTISH ROY | M | 66.0 | SC | BSP | 5650 | 0 | 5650 | 202689 | 180960 |
| 4 | 1 | Mekliganj | None of the Above | NaN | NaN | NaN | NOTA | 2331 | 2 | 2333 | 202689 | 180960 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2250 | 294 | Murarai | HAYATUNNINSH BIBI | F | 26.0 | GEN | BJP | 5341 | 4 | 5345 | 234055 | 199235 |
| 2251 | 294 | Murarai | None of the Above | NaN | NaN | NaN | NOTA | 1687 | 2 | 1689 | 234055 | 199235 |
| 2252 | 294 | Murarai | BELAL SEKH | M | 28.0 | GEN | BSP | 1376 | 0 | 1376 | 234055 | 199235 |
| 2253 | 294 | Murarai | MANGAL HEMRAM | M | 65.0 | ST | SUCI | 910 | 7 | 917 | 234055 | 199235 |
| 2254 | 294 | Murarai | SAMSHUL MIYA | M | 48.0 | GEN | RLD | 866 | 0 | 866 | 234055 | 199235 |
2255 rows × 12 columns
# Number of rows per candidate category
Bengal_detail_result_analysis["Candidate Category"].value_counts()
GEN 1196 SC 614 ST 151 Name: Candidate Category, dtype: int64
# Count plot of the candidate categories with a percentage label on each bar
visualization(Bengal_detail_result_analysis,"Candidate Category"," ","Count","Number of Record","Category wise candidate").countplot_with_percentage_representation()
# Number of rows per candidate sex
Bengal_detail_result_analysis["Candidate Sex"].value_counts()
M 1761 F 200 Name: Candidate Sex, dtype: int64
# Count plot of candidate sex with a percentage label on each bar
visualization(Bengal_detail_result_analysis,"Candidate Sex"," ","Count","Number of Record","Male vs Female candidates").countplot_with_percentage_representation()
# List the column names (several of them start with a space, see the output)
Bengal_detail_result_analysis.columns
Index(['Constituency No.', 'Constituency Name', 'Candidate Name',
'Candidate Sex', 'Candidate Age', 'Candidate Category', ' Party Name',
' VALID VOTES POLLED in General', ' VALID VOTES POLLED in Postal',
' Total Valid Votes', 'Total Electors', 'Total Votes'],
dtype='object')
# Distinct party codes; note the leading space in the column name
Bengal_detail_result_analysis[' Party Name'].unique()
array(['AITC', 'AIFB', 'BJP', 'BSP', 'NOTA', 'SUCI', 'AMB', 'IND', 'KPPU',
'CPM', 'INC', 'WPOI', 'RSP', 'JMM', 'CPI(ML)(L)', 'ABGL', 'GOJAM',
'GRAC', 'ABHM', 'BMUP', 'JDP', 'JD(U)', 'SP', 'SWJP', 'CPI',
'CPIM', 'HKRD', 'SDPI', 'RPI(A)', 'SHS', 'JESM', 'PDS', 'IUML',
'MPOI', 'LJP', 'GMM', 'NDPOI', 'TRMRPPI', 'AAAP', 'RAJSP', 'IUC',
'RLD', 'RJD', 'JD(S)', 'MHB', 'STPI', 'FDLP', 'NCP', 'BHNP',
'DSP(P)', 'AJSUP', 'JKP(N)', 'AKBJHP', 'JHAP', 'SP(I)', 'RCPI(R)'],
dtype=object)
# Calling .plot.bar() on the grouped frame draws one separate bar chart per
# constituency (the output lists one Axes object per group).
# NOTE(review): if a single chart of total votes per constituency was meant,
# aggregate first, e.g. .groupby('Constituency Name').sum().plot.bar() - confirm.
Bengal_detail_result_analysis[['Constituency Name',' Total Valid Votes']].groupby('Constituency Name').plot.bar()
Constituency Name
Alipurduars AxesSubplot(0.125,0.125;0.775x0.755)
Amdanga AxesSubplot(0.125,0.125;0.775x0.755)
Amta AxesSubplot(0.125,0.125;0.775x0.755)
Arambagh AxesSubplot(0.125,0.125;0.775x0.755)
Asansol Dakshin AxesSubplot(0.125,0.125;0.775x0.755)
...
Udaynarayanpur AxesSubplot(0.125,0.125;0.775x0.755)
Uluberia Dakshin AxesSubplot(0.125,0.125;0.775x0.755)
Uluberia Purba AxesSubplot(0.125,0.125;0.775x0.755)
Uluberia Uttar AxesSubplot(0.125,0.125;0.775x0.755)
Uttarpara AxesSubplot(0.125,0.125;0.775x0.755)
Length: 293, dtype: object
# Display the candidate-level results frame again
Bengal_detail_result_analysis
| Constituency No. | Constituency Name | Candidate Name | Candidate Sex | Candidate Age | Candidate Category | Party Name | VALID VOTES POLLED in General | VALID VOTES POLLED in Postal | Total Valid Votes | Total Electors | Total Votes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Mekliganj | ARGHYA ROY PRADHAN (BILU) | M | 44.0 | SC | AITC | 74608 | 215 | 74823 | 202689 | 180960 |
| 1 | 1 | Mekliganj | PARESH CHANDRA ADHIKARY | M | 63.0 | SC | AIFB | 67821 | 365 | 68186 | 202689 | 180960 |
| 2 | 1 | Mekliganj | DADHIRAM RAY | M | 32.0 | SC | BJP | 23313 | 42 | 23355 | 202689 | 180960 |
| 3 | 1 | Mekliganj | JYOTISH ROY | M | 66.0 | SC | BSP | 5650 | 0 | 5650 | 202689 | 180960 |
| 4 | 1 | Mekliganj | None of the Above | NaN | NaN | NaN | NOTA | 2331 | 2 | 2333 | 202689 | 180960 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2250 | 294 | Murarai | HAYATUNNINSH BIBI | F | 26.0 | GEN | BJP | 5341 | 4 | 5345 | 234055 | 199235 |
| 2251 | 294 | Murarai | None of the Above | NaN | NaN | NaN | NOTA | 1687 | 2 | 1689 | 234055 | 199235 |
| 2252 | 294 | Murarai | BELAL SEKH | M | 28.0 | GEN | BSP | 1376 | 0 | 1376 | 234055 | 199235 |
| 2253 | 294 | Murarai | MANGAL HEMRAM | M | 65.0 | ST | SUCI | 910 | 7 | 917 | 234055 | 199235 |
| 2254 | 294 | Murarai | SAMSHUL MIYA | M | 48.0 | GEN | RLD | 866 | 0 | 866 | 234055 | 199235 |
2255 rows × 12 columns
# Display the column names again
Bengal_detail_result_analysis.columns
Index(['Constituency No.', 'Constituency Name', 'Candidate Name',
'Candidate Sex', 'Candidate Age', 'Candidate Category', ' Party Name',
' VALID VOTES POLLED in General', ' VALID VOTES POLLED in Postal',
' Total Valid Votes', 'Total Electors', 'Total Votes'],
dtype='object')
# Box plot of the candidate ages (only the Series is handed to the plot class)
visualization(
    Bengal_detail_result_analysis['Candidate Age'],
    "",
    "",
    "Candidates age",
    "",
    "Candidates age anaysis",
).boxplot_plot()
# Party-level summary of the 2016 election; column names are on the second row
Performance_of_political_parties_2016 = pd.read_excel(
    "Performance of Poltical Parties.xlsx",
    header=1,
)
Performance_of_political_parties_2016.head(5)
| Party Type | Party Name | Contested | Won | Forfitted | Votes | Total Valid Votes | Votes in % | Total Valid Votes in Seat Contested | VOTE % IN SEATS CONTESTED | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | N | Bharatiya Janata Party | 291 | 3 | 263 | 5555134 | 54697791 | 10.156048 | 54236459 | 10.242435 |
| 1 | N | Bahujan Samaj Party | 161 | 0 | 161 | 300294 | 54697791 | 0.549006 | 29810643 | 1.007338 |
| 2 | N | Communist Party of India | 11 | 1 | 0 | 791925 | 54697791 | 1.447819 | 2124882 | 37.269128 |
| 3 | N | Communist Party of India (Marxist) | 148 | 26 | 0 | 10802058 | 54697791 | 19.748618 | 28059913 | 38.496406 |
| 4 | N | Indian National Congress | 92 | 44 | 4 | 6700938 | 54697791 | 12.250838 | 16660837 | 40.219696 |
def piechart_party(dataframe):
    """Draw a pie chart of the four parties that won the most seats.

    ``dataframe`` must have a "Won" column. Slice labels are read from the
    frame's "Party Name" column (surrounding whitespace in the header is
    tolerated) so that every label belongs to the value it annotates. When
    no such column exists, the original fixed list of party names is used.
    """
    fallback_labels = ["All India Trinamool Congress","Indian National Congress","Communist Party of India (Marxist)","Bharatiya Janata Party"]
    # nlargest keeps label and seat count on the same row; ties are resolved
    # in favour of the row that appears first in the frame.
    top_rows = dataframe.nlargest(4, "Won")
    seats = top_rows["Won"].tolist()
    party_column = next(
        (column for column in dataframe.columns if str(column).strip() == "Party Name"),
        None,
    )
    if party_column is None:
        labels = fallback_labels[:len(seats)]
    else:
        labels = [str(name) for name in top_rows[party_column]]
    total_seats = sum(seats)
    # Each slice is pushed outwards in proportion to its share of the seats
    explode = [count / total_seats if total_seats else 0 for count in seats]
    fig, ax = plt.subplots()
    ax.pie(seats, explode=explode, labels=labels, autopct='%1.0f%%', shadow=False, startangle=0,pctdistance=1.2,labeldistance=1.32)
    ax.axis('equal')
    ax.set_title("Parties final seat won 2016")
    ax.legend(frameon=False, bbox_to_anchor=(1.5,1.0))
# Pie chart of the seats won in 2016
piechart_party(Performance_of_political_parties_2016)
# Hand-labelled tweets; the file is read as ISO-8859-1
Bengal_2021_twitter_analysis = pd.read_csv("Bengal_sentiment.csv",encoding = "ISO-8859-1")
import warnings
# Silence deprecation warnings for the rest of the notebook
warnings.filterwarnings("ignore", category=DeprecationWarning)
Bengal_2021_twitter_analysis.head()
| Statment | Review BJP | Review AITC | |
|---|---|---|---|
| 0 | Over 125 BJP workers have been massacred in Be... | positive | negative |
| 1 | West Bengal has long suffered due to @mamataof... | positive | negative |
| 2 | EVMs and VVPATs found at the residence of a TM... | positive | negative |
| 3 | Amit Shah said if BJP doesn't win in Bengal, i... | negative | positive |
| 4 | TMC candidate Manoranjan Byapari, a former ref... | negative | positive |
# Rename the columns; the first one goes from "Statment" to "Tweets"
Bengal_2021_twitter_analysis.columns = ["Tweets","Review BJP","Review AITC"]
Bengal_2021_twitter_analysis
| Tweets | Review BJP | Review AITC | |
|---|---|---|---|
| 0 | Over 125 BJP workers have been massacred in Be... | positive | negative |
| 1 | West Bengal has long suffered due to @mamataof... | positive | negative |
| 2 | EVMs and VVPATs found at the residence of a TM... | positive | negative |
| 3 | Amit Shah said if BJP doesn't win in Bengal, i... | negative | positive |
| 4 | TMC candidate Manoranjan Byapari, a former ref... | negative | positive |
| ... | ... | ... | ... |
| 94 | Today's hired high-tech election strategists a... | positive | negative |
| 95 | National media's obsession to term anything an... | negative | positive |
| 96 | Today election is tailor only when they will t... | positive | negative |
| 97 | So many powerful people from Hyderabad, up, bi... | negative | positive |
| 98 | Election battle in #WestBengalAssemblyElection... | NaN | NaN |
99 rows × 3 columns
# Number of tweets per "Review BJP" label
Bengal_2021_twitter_analysis["Review BJP"].value_counts()
positive 54 negative 39 Name: Review BJP, dtype: int64
# Count plot of the "Review BJP" labels with a percentage label on each bar
visualization(Bengal_2021_twitter_analysis,"Review BJP","","Count","Number of Record","Tweets in favour Vs Tweets Aganist").countplot_with_percentage_representation()
from PIL import Image as PILImage
import base64, io, IPython
def image_import(image):
    """Return an IPython HTML object that embeds the given image as an inline PNG.

    ``image`` is whatever ``PIL.Image.open`` accepts; this file calls it with
    a file name. The picture is re-encoded as PNG and placed in a base64
    ``data:`` URI inside an ``<img>`` tag.
    """
    output = io.BytesIO()
    # The context manager closes the source image once it has been re-encoded;
    # the original never closed it.
    with PILImage.open(image) as picture:
        picture.save(output, format='PNG')
    encoded_string = base64.b64encode(output.getvalue()).decode()
    html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
    return IPython.display.HTML(html)
image_import("Modi_ji.png")
# Lower-case every tweet in the frame
Bengal_2021_twitter_analysis['Tweets'] = list(
    map(str.lower, Bengal_2021_twitter_analysis['Tweets'])
)
# The column is already lower-case, so a plain list of it gives the same values
Reviews_list = Bengal_2021_twitter_analysis['Tweets'].tolist()
Reviews_list[0]
'over 125 bjp workers have been massacred in bengal at the instigation of @mamataofficial in order to further her political motives in west bengal and to make space for her nephew. is didi going to run away from this reality and hide behind closed doors as always?'
import nltk
# Resources fetched for the tokenizing and POS-tagging steps that follow
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\mayan\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] C:\Users\mayan\AppData\Roaming\nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date!
True
# First, tokenize every tweet into sentences and, separately, into words
Sentence_tokkenize = [nltk.sent_tokenize(sentence) for sentence in Reviews_list]
words_tokkenize =[nltk.word_tokenize(words) for words in Reviews_list]
from nltk.corpus import stopwords
nltk.download("stopwords")
# English stop-word list
stopword_all = stopwords.words('english')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\mayan\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
# Part-of-speech tag every tokenized tweet; each item becomes a list of (word, tag) pairs
Tagged_words = [nltk.pos_tag(words_pos) for words_pos in words_tokkenize ]
# Noun tagging
def noun_tagging(lst):
    """Collect every (word, tag) pair whose POS tag is one of the noun tags.

    ``lst`` is a list of tagged sentences, each a sequence of (word, tag)
    pairs. The result is one flat list that keeps the original order.
    """
    noun_tags = ("NN", "NNP", "NNPS", "NNS")
    return [pair for sentence in lst for pair in sentence if pair[1] in noun_tags]
# Nouns found in the first tweet only
noun_tagging(Tagged_words[:1])
[('workers', 'NNS'),
('bengal', 'NN'),
('instigation', 'NN'),
('mamataofficial', 'NN'),
('order', 'NN'),
('motives', 'NNS'),
('bengal', 'NN'),
('space', 'NN'),
('nephew', 'NN'),
('reality', 'NN'),
('hide', 'NN'),
('doors', 'NNS')]
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import RegexpTokenizer
Rg_tokenizer = RegexpTokenizer(r'\w+')
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
# Clean every tweet: keep letters only, lower-case, drop English stop words
# and lemmatize the remaining words as verbs.
corpus = []
# Fetch the stop words and build the lookup set once; the original fetched
# the list for every tweet and rebuilt the set for every single word.
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)
for i in range(0, len(Tagged_words)):
    review = re.sub('[^a-zA-Z]', ' ', Bengal_2021_twitter_analysis['Tweets'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word,'v') for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\mayan\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\mayan\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
# Strip every punctuation character from the cleaned tweets
import string
punctuation_remover = str.maketrans('', '', string.punctuation)
corpus_upd = [text.translate(punctuation_remover) for text in corpus]
# Split each cleaned tweet into word tokens
word_tokken_1 = [nltk.word_tokenize(text) for text in corpus_upd]
import gensim
from gensim.utils import simple_preprocess
# The documents handed to gensim are exactly the token lists built above
processed_docs = list(word_tokken_1)
print(processed_docs[:2])
[['bjp', 'workers', 'massacre', 'bengal', 'instigation', 'mamataofficial', 'order', 'political', 'motives', 'west', 'bengal', 'make', 'space', 'nephew', 'didi', 'go', 'run', 'away', 'reality', 'hide', 'behind', 'close', 'doors', 'always'], ['west', 'bengal', 'long', 'suffer', 'due', 'mamataofficial', 'thirst', 'power', 'power', 'hungry', 'cm', 'sacrifice', 'need', 'people', 'order', 'restrict', 'central', 'government', 'good', 'scheme', 'implement', 'west', 'bengal', 'bengal', 'reject', 'didi']]
# Map every distinct token to an integer id
dictionary = gensim.corpora.Dictionary(processed_docs)
# Prune the vocabulary with the thresholds below (see gensim's filter_extremes
# documentation for the exact meaning of no_below / no_above / keep_n)
dictionary.filter_extremes(no_below=1, no_above=0.1, keep_n= 100)
# Bag-of-words form of every document: a list of (token id, count) pairs
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[21]
[(29, 1), (40, 1), (67, 1), (70, 1), (71, 1)]
# Print the bag-of-words entries of one document in readable form
document_num = 1
bow_doc_x = bow_corpus[document_num]
for token_id, token_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], token_count))
Word 0 ("order") appears 1 time.
Word 3 ("central") appears 1 time.
Word 4 ("cm") appears 1 time.
Word 5 ("good") appears 1 time.
Word 6 ("government") appears 1 time.
Word 7 ("long") appears 1 time.
Word 8 ("reject") appears 1 time.
Word 9 ("scheme") appears 1 time.
# Fit a 10-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so the topics printed below are
# presumably not reproducible between runs - confirm and pin a seed if needed.
lda_model = gensim.models.LdaMulticore(bow_corpus,
num_topics = 10,
id2word = dictionary,
passes = 10,
workers = 2)
# Print every topic with its highest-weighted words
for idx, topic in lda_model.print_topics(-1):
print("Topic: {} \nWords: {}".format(idx, topic ))
print("\n")
Topic: 0 Words: 0.113*"vote" + 0.095*"show" + 0.091*"think" + 0.048*"evm" + 0.048*"leader" + 0.048*"officer" + 0.046*"like" + 0.030*"cm" + 0.025*"home" + 0.025*"today" Topic: 1 Words: 0.074*"soldier" + 0.074*"country" + 0.064*"lose" + 0.050*"shah" + 0.043*"pulwama" + 0.043*"compromise" + 0.042*"assembly" + 0.041*"national" + 0.041*"leave" + 0.032*"attack" Topic: 2 Words: 0.104*"politics" + 0.046*"corruption" + 0.046*"next" + 0.046*"time" + 0.046*"know" + 0.045*"also" + 0.045*"vote" + 0.044*"see" + 0.032*"govt" + 0.031*"banerjee" Topic: 3 Words: 0.071*"want" + 0.070*"govt" + 0.057*"work" + 0.057*"poll" + 0.056*"know" + 0.043*"inr" + 0.043*"political" + 0.043*"workers" + 0.043*"culture" + 0.029*"corruption" Topic: 4 Words: 0.069*"nandigram" + 0.069*"years" + 0.061*"mamata" + 0.043*"clear" + 0.043*"lose" + 0.043*"accept" + 0.040*"us" + 0.039*"tell" + 0.035*"challenge" + 0.035*"di" Topic: 5 Words: 0.078*"time" + 0.073*"india" + 0.041*"lakh" + 0.041*"major" + 0.041*"also" + 0.041*"commission" + 0.041*"central" + 0.041*"vote" + 0.041*"leader" + 0.041*"elections" Topic: 6 Words: 0.121*"nandigram" + 0.067*"banerjee" + 0.064*"mamata" + 0.061*"cm" + 0.050*"political" + 0.050*"compromise" + 0.050*"order" + 0.042*"day" + 0.042*"commission" + 0.026*"like" Topic: 7 Words: 0.091*"sonar" + 0.081*"bangla" + 0.080*"youth" + 0.071*"restore" + 0.071*"fulfil" + 0.071*"smile" + 0.071*"farmers" + 0.071*"aspirations" + 0.071*"belief" + 0.071*"poor" Topic: 8 Words: 0.132*"hindi" + 0.109*"birthday" + 0.109*"speak" + 0.055*"wave" + 0.046*"call" + 0.046*"good" + 0.045*"like" + 0.045*"workers" + 0.043*"reject" + 0.024*"minister" Topic: 9 Words: 0.071*"get" + 0.055*"nd" + 0.052*"elections" + 0.048*"result" + 0.048*"government" + 0.048*"development" + 0.048*"intensify" + 0.048*"formation" + 0.048*"may" + 0.042*"high"
from gensim.models.coherencemodel import CoherenceModel
# Score the fitted topics with the 'c_v' coherence measure on the tokenized tweets
coherence_model_sc = CoherenceModel(model=lda_model, texts=word_tokken_1, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_sc.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Coherence Score: 0.4004311998140424
# Candidate-level data (party, criminal cases, education, assets, liabilities)
Candidate_analysis_2021 = pd.read_csv("west_bengal.csv")
Candidate_analysis_2021.profile_report()
Candidate_analysis_2021
| candidate | constituency | party | criminal_cases | education | total_assets | liabilities | |
|---|---|---|---|---|---|---|---|
| 0 | Abdul Hai Mallik | ONDA | IND | 0 | 8th Pass | 160171 | 0 |
| 1 | Abdur Razzak Molla | FALTA | INC | 2 | 10th Pass | 2650450 | 0 |
| 2 | Abhijit Bhattacharya | PURULIA | IND | 0 | Post Graduate | 4439532 | 375000 |
| 3 | Abir Chandra Mandal | CHHATNA | IND | 0 | Graduate Professional | 809000 | 0 |
| 4 | Adhikari Suvendu | NANDIGRAM | BJP | 1 | Post Graduate | 10552749 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 562 | Uttam Naskar | BISHNUPUR (SC) | IND | 0 | 10th Pass | 511291 | 0 |
| 563 | Uttara Singha Hazra | GARBETA | AITC | 1 | Graduate | 17710285 | 6348523 |
| 564 | Vibekananda Mondal | ULUBERIA UTTAR (SC) | IND | 0 | 12th Pass | 11953 | 0 |
| 565 | Vivekananda Bauri | RAGHUNATHPUR (SC) | BJP | 0 | Graduate Professional | 2030856 | 0 |
| 566 | Yamini Kanta Mandi | MANBAZAR (ST) | CPI(M) | 0 | Post Graduate | 3396148 | 1000000 |
567 rows × 7 columns
# Number of candidates per education level
Candidate_analysis_2021["education"].value_counts()
Graduate 145 12th Pass 106 Post Graduate 105 10th Pass 94 8th Pass 54 Graduate Professional 33 Doctorate 8 Others 7 5th Pass 7 Literate 5 Illiterate 3 Name: education, dtype: int64
# Education levels of all candidates
visualization(Candidate_analysis_2021,"education","","Count","Number of Record","Education wise candidate").countplot_with_percentage_representation()
# Rows of the candidates whose party is "IND"
IND_candidates_education = Candidate_analysis_2021[(Candidate_analysis_2021["party"]=="IND")]
IND_candidates_education["education"].value_counts()
10th Pass 38 12th Pass 30 Graduate 21 8th Pass 20 Post Graduate 16 5th Pass 4 Illiterate 3 Literate 2 Graduate Professional 2 Doctorate 1 Name: education, dtype: int64
# Education levels and criminal-case counts of the IND candidates
visualization(IND_candidates_education,"education","","Count","Number of Record","Education wise candidate").countplot_with_percentage_representation()
visualization(IND_candidates_education,"criminal_cases","","Count","Number of Record","Criminal record of candidate").countplot_with_percentage_representation()
# Rows of the candidates whose party is "AITC"
AITC_candidates_education = Candidate_analysis_2021[(Candidate_analysis_2021["party"]=="AITC")]
AITC_candidates_education["education"].value_counts()
Graduate 26 Post Graduate 17 12th Pass 15 Graduate Professional 12 10th Pass 8 8th Pass 6 Doctorate 3 Others 2 5th Pass 1 Name: education, dtype: int64
# Education levels of the AITC candidates
visualization(AITC_candidates_education,"education","","Count","Number of Record","Education wise candidate").countplot_with_percentage_representation()
# Criminal-case counts of the AITC candidates. The title used to be the
# copy-pasted "Education wise candidate", which mislabelled this chart.
visualization(AITC_candidates_education,"criminal_cases","","Count","Number of Record","Criminal record of candidate").countplot_with_percentage_representation()
# Rows of the candidates whose party is "SUCI(C)"
SUCI_candidates_education = Candidate_analysis_2021[(Candidate_analysis_2021["party"]=="SUCI(C)")]
SUCI_candidates_education["education"].value_counts()
Graduate 21 12th Pass 18 Post Graduate 13 10th Pass 11 8th Pass 5 Others 2 Graduate Professional 2 Literate 1 Doctorate 1 Name: education, dtype: int64
# Education levels of the SUCI(C) candidates
visualization(SUCI_candidates_education,"education","","Count","Number of Record","Education wise candidate").countplot_with_percentage_representation()
# Rows of the candidates whose party is "CPI"
CPI_candidates_education = Candidate_analysis_2021[(Candidate_analysis_2021["party"]=="CPI")]
CPI_candidates_education["education"].value_counts()
Post Graduate 4 12th Pass 1 Graduate 1 Name: education, dtype: int64
# Education levels of the CPI candidates
visualization(CPI_candidates_education,"education","","Count","Number of Record","Education wise candidate").countplot_with_percentage_representation()
# Rows of the candidates whose party is "BJP"
BJP_candidates_education = Candidate_analysis_2021[(Candidate_analysis_2021["party"]=="BJP")]
BJP_candidates_education["education"].value_counts()
Graduate 27 Post Graduate 20 12th Pass 13 10th Pass 13 8th Pass 8 Graduate Professional 6 Doctorate 2 Others 1 Name: education, dtype: int64
# Education levels and criminal-case counts of the BJP candidates
visualization(BJP_candidates_education,"education","","Count","Number of Record","Education wise candidate").countplot_with_percentage_representation()
visualization(BJP_candidates_education,"criminal_cases","","Count","Number of Record","Criminal record wise candidate").countplot_with_percentage_representation()
# Scrape the first table of the article (GSDP growth figures per financial year)
Growth_rate_Bengal = web_scrap(
    "https://thewire.in/political-economy/west-bengal-budget-gdp-growth-employment", 0
).Table_scrap()
# Header row first, data rows after it
Bengal_growth_Dataframe = pd.DataFrame(
    Growth_rate_Bengal[1:],
    columns=Growth_rate_Bengal[0],
)
Bengal_growth_Dataframe
| GSDP (Constant) Growth % WB Budget 2018-19 | GSDP (Constant) Growth % – CSO | ||
|---|---|---|---|
| 0 | 2012-13 | 5.6 | 4.2 |
| 1 | 2013-14 | 6.5 | 3.0 |
| 2 | 2014-15 | 9.0 | 2.8 |
| 3 | 2015-16 | 5.9 | 6.1 |
| 4 | 2016-17 | 8.0 | 7.9 |
| 5 | 2017-18 | 11.5 | 9.1 |
# Both growth columns are scraped as strings; convert them before plotting
for growth_column in (
    "GSDP (Constant) Growth % WB Budget 2018-19",
    "GSDP (Constant) Growth % – CSO",
):
    Bengal_growth_Dataframe[growth_column] = convertor(Bengal_growth_Dataframe[growth_column])
Bengal_growth_Dataframe.plot.line()
<AxesSubplot:>
# Training data: one row per candidate with a pre-election prediction and the final result
Train_data = pd.read_excel("Bengal_train_data.xlsx")
Train_data
| candidate | constituency | party | Winning prediction before election | Final Result | |
|---|---|---|---|---|---|
| 0 | ARGHYA ROY PRADHAN (BILU) | Mekliganj | AITC | 0 | 1.0 |
| 1 | PARESH CHANDRA ADHIKARY | Mekliganj | AIFB | 1 | 0.0 |
| 2 | DADHIRAM RAY | Mekliganj | BJP | 0 | 0.0 |
| 3 | JYOTISH ROY | Mekliganj | BSP | 0 | 0.0 |
| 4 | None of the Above | Mekliganj | NOTA | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... |
| 2250 | HAYATUNNINSH BIBI | Murarai | BJP | 0 | 0.0 |
| 2251 | None of the Above | Murarai | NOTA | 0 | 0.0 |
| 2252 | BELAL SEKH | Murarai | BSP | 0 | 0.0 |
| 2253 | MANGAL HEMRAM | Murarai | SUCI | 0 | 0.0 |
| 2254 | SAMSHUL MIYA | Murarai | RLD | 0 | 0.0 |
2255 rows × 5 columns
# Encode the target column "Final Result" as integer class labels
label_Encoder = preprocessing.LabelEncoder()
Train_data["Final Result"]= label_Encoder.fit_transform(Train_data["Final Result"])
class label_enconding:
    """Label-encode the categorical columns of an election dataframe.

    The columns ``candidate``, ``constituency`` and ``party`` are replaced,
    in place, by integer codes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing at least the three categorical columns. It is
        modified in place by :meth:`features`.
    encoders : dict, optional
        Mapping ``column name -> already fitted encoder`` (any object with a
        ``transform`` method). Columns present in this mapping are encoded
        with ``encoder.transform`` instead of fitting a new encoder, which
        lets a test set share the exact integer codes learned on the training
        set. Columns missing from the mapping get a freshly fitted
        ``LabelEncoder`` (the original behaviour).

    Attributes
    ----------
    encoders : dict
        After :meth:`features` has run, maps each categorical column to the
        encoder that was used for it, so the mapping can be reused or
        inverted later.
    """

    # Columns that hold string categories and must become integer codes.
    CATEGORICAL_COLUMNS = ("candidate", "constituency", "party")

    def __init__(self, dataframe, encoders=None):
        self.dataframe = dataframe
        # Kept separate from `self.encoders` so that calling `features()`
        # twice without supplied encoders refits, exactly as before.
        self._supplied_encoders = dict(encoders) if encoders else {}
        self.encoders = {}

    def features(self):
        """Encode the categorical columns in place and return the dataframe.

        Returns
        -------
        pandas.DataFrame
            The same object that was passed to the constructor, with the
            categorical columns overwritten by their integer codes.
        """
        for column in self.CATEGORICAL_COLUMNS:
            encoder = self._supplied_encoders.get(column)
            if encoder is None:
                # One encoder per column so each fitted mapping is preserved.
                encoder = preprocessing.LabelEncoder()
                encoded = encoder.fit_transform(self.dataframe[column])
            else:
                encoded = encoder.transform(self.dataframe[column])
            self.dataframe[column] = encoded
            self.encoders[column] = encoder
        return self.dataframe
# Encode the categorical training columns. `features()` writes the codes back
# into the dataframe it was given and returns that same object, so
# `Data_frame_upd` and `Train_data` refer to the same (now encoded) frame.
Data_frame_upd = label_enconding(Train_data).features()
Data_frame_upd
| candidate | constituency | party | Winning prediction before election | Final Result | |
|---|---|---|---|---|---|
| 0 | 194 | 194 | 4 | 0 | 1 |
| 1 | 1195 | 194 | 3 | 1 | 0 |
| 2 | 468 | 194 | 9 | 0 | 0 |
| 3 | 803 | 194 | 11 | 0 | 0 |
| 4 | 1168 | 194 | 39 | 0 | 0 |
| ... | ... | ... | ... | ... | ... |
| 2250 | 704 | 201 | 9 | 0 | 0 |
| 2251 | 1168 | 201 | 39 | 0 | 0 |
| 2252 | 330 | 201 | 11 | 0 | 0 |
| 2253 | 966 | 201 | 52 | 0 | 0 |
| 2254 | 1465 | 201 | 44 | 0 | 0 |
2255 rows × 5 columns
# Features are every column except the last; the target is the last column
# ("Final Result" in the frame displayed above).
X_fold_nn = Data_frame_upd.iloc[:, :-1].to_numpy()
y_fold_nn = Data_frame_upd.iloc[:, -1].to_numpy()

kf = KFold(n_splits=7, shuffle=True, random_state=42)

# NOTE(review): the loop only rebinds the split arrays, so once it finishes
# they hold the LAST of the 7 folds. The single score printed per model below
# suggests the models are fitted and evaluated on that one split only, i.e.
# this acts as a single hold-out split rather than cross-validation.
for train_index, test_index in kf.split(X_fold_nn):
    X_train_fold_nn = X_fold_nn[train_index]
    X_test_fold_nn = X_fold_nn[test_index]
    y_train_fold_nn = y_fold_nn[train_index]
    y_test_fold_nn = y_fold_nn[test_index]
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline (C=0.1, fixed seed) on the training
# part of the split.
Classifier_logisticRegression = LogisticRegression(random_state=42, C=0.1)
Classifier_logisticRegression.fit(X=X_train_fold_nn, y=y_train_fold_nn)
LogisticRegression(C=0.1, random_state=42)
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Score the logistic-regression baseline on the held-out part of the split.
y_pred = Classifier_logisticRegression.predict(X_test_fold_nn)
acc_lr = accuracy_score(y_true=y_test_fold_nn, y_pred=y_pred)
print("accuracy {}".format(acc_lr))
accuracy 0.8975155279503105
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed, fitted on the
# training part of the split.
Classifier_kfold_rf = RandomForestClassifier(random_state=42)
Classifier_kfold_rf.fit(X=X_train_fold_nn, y=y_train_fold_nn)
RandomForestClassifier(random_state=42)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Evaluate the random forest on the held-out part of the split.
y_pred_kfold_RF = Classifier_kfold_rf.predict(X_test_fold_nn)

# scikit-learn metrics take (y_true, y_pred) in that order. The previous code
# passed (y_pred, y_true), which transposed the confusion matrix and swapped
# the reported precision and recall (accuracy and binary F1 are symmetric and
# were unaffected).
cm_kfold_RF = confusion_matrix(y_test_fold_nn, y_pred_kfold_RF)
accuracy_kfold_RF = accuracy_score(y_test_fold_nn, y_pred_kfold_RF)
precision_kfold_RF = precision_score(y_test_fold_nn, y_pred_kfold_RF)
recall_kfold_RF = recall_score(y_test_fold_nn, y_pred_kfold_RF)
F1score_kfold_RF = f1_score(y_test_fold_nn, y_pred_kfold_RF)
print(cm_kfold_RF)
print("Accuracy{} and Precission:{} and Recall:{} and F1score:{}".format(accuracy_kfold_RF,precision_kfold_RF,recall_kfold_RF,F1score_kfold_RF))
[[266 13] [ 9 34]] Accuracy0.9316770186335404 and Precission:0.723404255319149 and Recall:0.7906976744186046 and F1score:0.7555555555555555
from xgboost import XGBClassifier

# Gradient-boosted trees with library defaults, fitted on the training part
# of the split.
# NOTE(review): the fit log reports objective 'multi:softprob', which XGBoost
# selects for more than two classes -- verify the target really is binary.
classifier_xgboost = XGBClassifier()
classifier_xgboost.fit(X=X_train_fold_nn, y=y_train_fold_nn)
[18:44:56] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=12, num_parallel_tree=1,
objective='multi:softprob', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=None, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Evaluate the XGBoost model on the held-out part of the split.
y_pred_kfold_xgboost = classifier_xgboost.predict(X_test_fold_nn)

# scikit-learn metrics take (y_true, y_pred) in that order. The previous code
# passed (y_pred, y_true), which transposed the confusion matrix and swapped
# the reported precision and recall (accuracy and binary F1 are symmetric and
# were unaffected).
cm_kfold_xg = confusion_matrix(y_test_fold_nn, y_pred_kfold_xgboost)
accuracy_kfold_xg = accuracy_score(y_test_fold_nn, y_pred_kfold_xgboost)
precision_kfold_xg = precision_score(y_test_fold_nn, y_pred_kfold_xgboost)
recall_kfold_xg = recall_score(y_test_fold_nn, y_pred_kfold_xgboost)
F1score_kfold_xg = f1_score(y_test_fold_nn, y_pred_kfold_xgboost)
print(cm_kfold_xg)
print("Accuracy{} and Precission:{} and Recall:{} and F1score:{}".format(accuracy_kfold_xg,precision_kfold_xg,recall_kfold_xg,F1score_kfold_xg))
[[268 8] [ 7 39]] Accuracy0.953416149068323 and Precission:0.8297872340425532 and Recall:0.8478260869565217 and F1score:0.8387096774193549
# Load the unlabelled test set twice: `Test_data` is label-encoded in place
# further down, while `Test_data_dum` keeps the original string values so the
# predictions can be shown next to readable names.
Test_data = pd.read_excel("Bengal_test.xlsx")
Test_data_dum = pd.read_excel("Bengal_test.xlsx")
Test_data.head()
| candidate | constituency | party | Winning prediction before election | |
|---|---|---|---|---|
| 0 | 0 | 4 | 11 | 0 |
| 1 | 7 | 4 | 4 | 0 |
| 2 | 9 | 4 | 13 | 0 |
| 3 | 10 | 4 | 1 | 1 |
| 4 | 13 | 4 | 8 | 0 |
# NOTE(review): plain assignment creates an alias, not a copy, so the in-place
# encoding below also rewrites `Test_data`.
Test_data_cp = Test_data
# NOTE(review): `label_enconding` fits fresh encoders on the test data, so the
# integer codes for candidate / constituency / party here are not the codes
# the random forest saw during training. Reusing the training encoders would
# be needed for the features to be comparable.
Data_frame_upd_test = label_enconding(Test_data_cp).features()
# All columns of the encoded test frame are fed to the model as features.
y_pred = Classifier_kfold_rf.predict(Data_frame_upd_test.to_numpy())
y_pred
array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1], dtype=int64)
# Attach the random-forest predictions to the untouched (string-valued) copy
# of the test set for display.
Test_data_dum["predicted_result"] = list(y_pred)
Test_data_dum.head()
| candidate | constituency | party | Winning prediction before election | predicted_result | |
|---|---|---|---|---|---|
| 0 | Abdul Hai Mallik | Onda | IND | 0 | 0 |
| 1 | Amarnath Shakha | Onda | BJP | 0 | 0 |
| 2 | Apurba Mondal | Onda | SUCI(C) | 0 | 0 |
| 3 | Arup Kumar Khan | Onda | AITC | 1 | 1 |
| 4 | Bikash Patra | Onda | CPI(ML) Red Star | 0 | 0 |
from sklearn.metrics import roc_auc_score, roc_curve

# "No skill" baseline: a constant score of 0 for every held-out sample.
ns_probs = [0] * len(y_test_fold_nn)
# Keep only column 1 of the predicted probabilities (the model's second class).
rf_probabs = Classifier_kfold_rf.predict_proba(X_test_fold_nn)[:, 1]

ns_auc = roc_auc_score(y_test_fold_nn, ns_probs)
rf_auc = roc_auc_score(y_test_fold_nn, rf_probabs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('RF: ROC AUC=%.3f' % (rf_auc))
No Skill: ROC AUC=0.500 RF: ROC AUC=0.969
# ROC curve points for the constant baseline and for the random forest.
ns_fpr, ns_tpr, _ = roc_curve(y_test_fold_nn, ns_probs)
rf_fpr, rf_tpr, _ = roc_curve(y_test_fold_nn, rf_probabs)

# Baseline as a dashed line, random forest with point markers.
plt.plot(ns_fpr, ns_tpr, label='No Skill', linestyle='--')
plt.plot(rf_fpr, rf_tpr, label='Randomforest', marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()